# Plotly setup: enable offline (in-notebook) rendering of plotly figures.
import plotly
plotly.offline.init_notebook_mode()
The problem is the risk analysis of patients with diabetes.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
# Load the scikit-learn diabetes dataset: 442 patients, 10 standardized
# baseline features, and a quantitative disease-progression target (per DESCR).
data_diabetes = datasets.load_diabetes()
# Display the raw Bunch (data, target, DESCR, feature_names, ...).
data_diabetes
{'data': array([[ 0.03807591, 0.05068012, 0.06169621, ..., -0.00259226,
0.01990749, -0.01764613],
[-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
-0.06833155, -0.09220405],
[ 0.08529891, 0.05068012, 0.04445121, ..., -0.00259226,
0.00286131, -0.02593034],
...,
[ 0.04170844, 0.05068012, -0.01590626, ..., -0.01107952,
-0.04688253, 0.01549073],
[-0.04547248, -0.04464164, 0.03906215, ..., 0.02655962,
0.04452873, -0.02593034],
[-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
-0.00422151, 0.00306441]]),
'target': array([151., 75., 141., 206., 135., 97., 138., 63., 110., 310., 101.,
69., 179., 185., 118., 171., 166., 144., 97., 168., 68., 49.,
68., 245., 184., 202., 137., 85., 131., 283., 129., 59., 341.,
87., 65., 102., 265., 276., 252., 90., 100., 55., 61., 92.,
259., 53., 190., 142., 75., 142., 155., 225., 59., 104., 182.,
128., 52., 37., 170., 170., 61., 144., 52., 128., 71., 163.,
150., 97., 160., 178., 48., 270., 202., 111., 85., 42., 170.,
200., 252., 113., 143., 51., 52., 210., 65., 141., 55., 134.,
42., 111., 98., 164., 48., 96., 90., 162., 150., 279., 92.,
83., 128., 102., 302., 198., 95., 53., 134., 144., 232., 81.,
104., 59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
173., 180., 84., 121., 161., 99., 109., 115., 268., 274., 158.,
107., 83., 103., 272., 85., 280., 336., 281., 118., 317., 235.,
60., 174., 259., 178., 128., 96., 126., 288., 88., 292., 71.,
197., 186., 25., 84., 96., 195., 53., 217., 172., 131., 214.,
59., 70., 220., 268., 152., 47., 74., 295., 101., 151., 127.,
237., 225., 81., 151., 107., 64., 138., 185., 265., 101., 137.,
143., 141., 79., 292., 178., 91., 116., 86., 122., 72., 129.,
142., 90., 158., 39., 196., 222., 277., 99., 196., 202., 155.,
77., 191., 70., 73., 49., 65., 263., 248., 296., 214., 185.,
78., 93., 252., 150., 77., 208., 77., 108., 160., 53., 220.,
154., 259., 90., 246., 124., 67., 72., 257., 262., 275., 177.,
71., 47., 187., 125., 78., 51., 258., 215., 303., 243., 91.,
150., 310., 153., 346., 63., 89., 50., 39., 103., 308., 116.,
145., 74., 45., 115., 264., 87., 202., 127., 182., 241., 66.,
94., 283., 64., 102., 200., 265., 94., 230., 181., 156., 233.,
60., 219., 80., 68., 332., 248., 84., 200., 55., 85., 89.,
31., 129., 83., 275., 65., 198., 236., 253., 124., 44., 172.,
114., 142., 109., 180., 144., 163., 147., 97., 220., 190., 109.,
191., 122., 230., 242., 248., 249., 192., 131., 237., 78., 135.,
244., 199., 270., 164., 72., 96., 306., 91., 214., 95., 216.,
263., 178., 113., 200., 139., 139., 88., 148., 88., 243., 71.,
77., 109., 272., 60., 54., 221., 90., 311., 281., 182., 321.,
58., 262., 206., 233., 242., 123., 167., 63., 197., 71., 168.,
140., 217., 121., 235., 245., 40., 52., 104., 132., 88., 69.,
219., 72., 201., 110., 51., 277., 63., 118., 69., 273., 258.,
43., 198., 242., 232., 175., 93., 168., 275., 293., 281., 72.,
140., 189., 181., 209., 136., 261., 113., 131., 174., 257., 55.,
84., 42., 146., 212., 233., 91., 111., 152., 120., 67., 310.,
94., 183., 66., 173., 72., 49., 64., 48., 178., 104., 132.,
220., 57.]),
'frame': None,
'DESCR': '.. _diabetes_dataset:\n\nDiabetes dataset\n----------------\n\nTen baseline variables, age, sex, body mass index, average blood\npressure, and six blood serum measurements were obtained for each of n =\n442 diabetes patients, as well as the response of interest, a\nquantitative measure of disease progression one year after baseline.\n\n**Data Set Characteristics:**\n\n :Number of Instances: 442\n\n :Number of Attributes: First 10 columns are numeric predictive values\n\n :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n\n :Attribute Information:\n - age age in years\n - sex\n - bmi body mass index\n - bp average blood pressure\n - s1 tc, total serum cholesterol\n - s2 ldl, low-density lipoproteins\n - s3 hdl, high-density lipoproteins\n - s4 tch, total cholesterol / HDL\n - s5 ltg, possibly log of serum triglycerides level\n - s6 glu, blood sugar level\n\nNote: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).\n\nSource URL:\nhttps://www4.stat.ncsu.edu/~boos/var.select/diabetes.html\n\nFor more information see:\nBradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani (2004) "Least Angle Regression," Annals of Statistics (with discussion), 407-499.\n(https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)\n',
'feature_names': ['age',
'sex',
'bmi',
'bp',
's1',
's2',
's3',
's4',
's5',
's6'],
'data_filename': 'diabetes_data_raw.csv.gz',
'target_filename': 'diabetes_target.csv.gz',
'data_module': 'sklearn.datasets.data'}
# Assemble a tidy DataFrame: the ten predictors plus the regression target.
df_diabetes = pd.DataFrame(
    data_diabetes.data,
    columns=data_diabetes.feature_names,
).assign(target=data_diabetes.target)
# Peek at the first five rows.
df_diabetes.head()
| age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.038076 | 0.050680 | 0.061696 | 0.021872 | -0.044223 | -0.034821 | -0.043401 | -0.002592 | 0.019907 | -0.017646 | 151.0 |
| 1 | -0.001882 | -0.044642 | -0.051474 | -0.026328 | -0.008449 | -0.019163 | 0.074412 | -0.039493 | -0.068332 | -0.092204 | 75.0 |
| 2 | 0.085299 | 0.050680 | 0.044451 | -0.005670 | -0.045599 | -0.034194 | -0.032356 | -0.002592 | 0.002861 | -0.025930 | 141.0 |
| 3 | -0.089063 | -0.044642 | -0.011595 | -0.036656 | 0.012191 | 0.024991 | -0.036038 | 0.034309 | 0.022688 | -0.009362 | 206.0 |
| 4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031988 | -0.046641 | 135.0 |
# Summary statistics. Features are mean-centered and scaled (each shows
# std ≈ 4.76e-02, per the dataset notes); target spans 25–346.
df_describe = df_diabetes.describe()
df_describe
| age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 442.000000 |
| mean | -2.511817e-19 | 1.230790e-17 | -2.245564e-16 | -4.797570e-17 | -1.381499e-17 | 3.918434e-17 | -5.777179e-18 | -9.042540e-18 | 9.293722e-17 | 1.130318e-17 | 152.133484 |
| std | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 77.093005 |
| min | -1.072256e-01 | -4.464164e-02 | -9.027530e-02 | -1.123988e-01 | -1.267807e-01 | -1.156131e-01 | -1.023071e-01 | -7.639450e-02 | -1.260971e-01 | -1.377672e-01 | 25.000000 |
| 25% | -3.729927e-02 | -4.464164e-02 | -3.422907e-02 | -3.665608e-02 | -3.424784e-02 | -3.035840e-02 | -3.511716e-02 | -3.949338e-02 | -3.324559e-02 | -3.317903e-02 | 87.000000 |
| 50% | 5.383060e-03 | -4.464164e-02 | -7.283766e-03 | -5.670422e-03 | -4.320866e-03 | -3.819065e-03 | -6.584468e-03 | -2.592262e-03 | -1.947171e-03 | -1.077698e-03 | 140.500000 |
| 75% | 3.807591e-02 | 5.068012e-02 | 3.124802e-02 | 3.564379e-02 | 2.835801e-02 | 2.984439e-02 | 2.931150e-02 | 3.430886e-02 | 3.243232e-02 | 2.791705e-02 | 211.500000 |
| max | 1.107267e-01 | 5.068012e-02 | 1.705552e-01 | 1.320436e-01 | 1.539137e-01 | 1.987880e-01 | 1.811791e-01 | 1.852344e-01 | 1.335973e-01 | 1.356118e-01 | 346.000000 |
# Distribution of every feature plus the target.
df_diabetes.hist(figsize=(12, 10))
plt.show()

# Pairwise Pearson correlations between all columns.
df_diabetes_corr = df_diabetes.corr()
print(df_diabetes_corr)

# Annotated heatmap of the same correlation matrix.
fig = plt.figure(figsize=(12, 10))
sns.heatmap(df_diabetes_corr, annot=True)
plt.title('Heatmap of Correlation Matrix of Diabetes Dataset')
plt.show()
age sex bmi bp s1 s2 s3 \
age 1.000000 0.173737 0.185085 0.335428 0.260061 0.219243 -0.075181
sex 0.173737 1.000000 0.088161 0.241010 0.035277 0.142637 -0.379090
bmi 0.185085 0.088161 1.000000 0.395411 0.249777 0.261170 -0.366811
bp 0.335428 0.241010 0.395411 1.000000 0.242464 0.185548 -0.178762
s1 0.260061 0.035277 0.249777 0.242464 1.000000 0.896663 0.051519
s2 0.219243 0.142637 0.261170 0.185548 0.896663 1.000000 -0.196455
s3 -0.075181 -0.379090 -0.366811 -0.178762 0.051519 -0.196455 1.000000
s4 0.203841 0.332115 0.413807 0.257650 0.542207 0.659817 -0.738493
s5 0.270774 0.149916 0.446157 0.393480 0.515503 0.318357 -0.398577
s6 0.301731 0.208133 0.388680 0.390430 0.325717 0.290600 -0.273697
target 0.187889 0.043062 0.586450 0.441482 0.212022 0.174054 -0.394789
s4 s5 s6 target
age 0.203841 0.270774 0.301731 0.187889
sex 0.332115 0.149916 0.208133 0.043062
bmi 0.413807 0.446157 0.388680 0.586450
bp 0.257650 0.393480 0.390430 0.441482
s1 0.542207 0.515503 0.325717 0.212022
s2 0.659817 0.318357 0.290600 0.174054
s3 -0.738493 -0.398577 -0.273697 -0.394789
s4 1.000000 0.617859 0.417212 0.430453
s5 0.617859 1.000000 0.464669 0.565883
s6 0.417212 0.464669 1.000000 0.382483
target 0.430453 0.565883 0.382483 1.000000
from sklearn.model_selection import train_test_split
# Univariate model: BMI is the independent variable and 'target' (disease
# progression) is the dependent variable.
X = df_diabetes[['bmi']]
y = df_diabetes['target']
# 70% train; the remaining 30% is split evenly into validation and test.
# random_state pins the split so the reported shapes and metrics are
# reproducible (the original resampled differently on every run).
X_train, X_old, y_train, y_old = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_old, y_old, test_size=0.5, random_state=42)
print(X_train.shape, X_val.shape, X_test.shape)
(309, 1) (66, 1) (67, 1)
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
def create_poly_model(X, y, degrees):
    """Fit one polynomial-regression pipeline per requested degree.

    Returns a dict mapping each degree to its fitted
    PolynomialFeatures -> LinearRegression pipeline.
    """
    def _fit_degree(degree):
        # Build and fit a single polynomial pipeline of the given degree.
        pipeline = Pipeline([
            ('polynomial', PolynomialFeatures(degree=degree)),
            ('linear', LinearRegression()),
        ])
        pipeline.fit(X, y)
        return pipeline

    return {degree: _fit_degree(degree) for degree in degrees}
# Fit polynomial models of degree 0 (constant baseline) through 5.
degrees = list(range(0, 6))
models = create_poly_model(X_train, y_train, degrees)
# Print each fitted pipeline for inspection.
for degree, model in models.items():
    print(f'Degree: {degree}')
    print(f'Model: {model}\n')
Degree: 0
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=0)),
('linear', LinearRegression())])
Degree: 1
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=1)),
('linear', LinearRegression())])
Degree: 2
Model: Pipeline(steps=[('polynomial', PolynomialFeatures()),
('linear', LinearRegression())])
Degree: 3
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=3)),
('linear', LinearRegression())])
Degree: 4
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=4)),
('linear', LinearRegression())])
Degree: 5
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=5)),
('linear', LinearRegression())])
from sklearn.metrics import r2_score, mean_absolute_error
# MAPE function
def mape(y_act, y_pred):
return np.mean(np.abs((y_act - y_pred) / y_act)) * 100
# Compare goodness of fit on train vs validation for every fitted degree.
for degree, model in models.items():
    print(f'Degree: {degree}')
    # Same three metrics per split, train first then validation.
    for label, X_split, y_split in (('Train', X_train, y_train),
                                    ('Val', X_val, y_val)):
        y_split_pred = model.predict(X_split)
        print(f' {label} R2: {r2_score(y_split, y_split_pred)}')
        print(f' {label} MAE: {mean_absolute_error(y_split, y_split_pred)}')
        print(f' {label} MAPE: {mape(y_split, y_split_pred)}\n')
Degree: 0
Train R2: 0.0
Train MAE: 63.50733653815942
Train MAPE: 59.58147132189093
Val R2: -0.02955490098489255
Val MAE: 68.57119741100324
Val MAPE: 62.47693581109562
Degree: 1
Train R2: 0.3468251148566268
Train MAE: 50.36855418910475
Train MAPE: 45.737787569038275
Val R2: 0.2256254729488586
Val MAE: 56.50933674689731
Val MAPE: 49.20006808168624
Degree: 2
Train R2: 0.34682520948663176
Train MAE: 50.368315074360446
Train MAPE: 45.73754242992912
Val R2: 0.2255975105856577
Val MAE: 56.51124502126944
Val MAPE: 49.20155695263188
Degree: 3
Train R2: 0.34686782034381825
Train MAE: 50.32425283843269
Train MAPE: 45.692923263547186
Val R2: 0.2248781164960707
Val MAE: 56.56343809008254
Val MAPE: 49.24369812392924
Degree: 4
Train R2: 0.3481049919424113
Train MAE: 50.17500402983993
Train MAPE: 45.43122002324573
Val R2: 0.2227122771160418
Val MAE: 56.84880526558141
Val MAPE: 49.49210413226087
Degree: 5
Train R2: 0.35815333765033563
Train MAE: 49.75626892720575
Train MAPE: 44.92705706177226
Val R2: 0.22449317094016064
Val MAE: 55.95407571564134
Val MAPE: 49.053396985860054
from sklearn.metrics import mean_squared_error
# Evaluate the degree-1 model on the held-out test set.
# BUG FIX: the original called models[1].fit(X_test, y_test) here, refitting
# the model on the test data before scoring it. That leaks the test set and
# makes the "test" metrics meaningless — the model must stay as trained on
# X_train. (The old comment also wrongly described models[1] as degree 5.)
y_test_pred = models[1].predict(X_test)
# Evaluating the model with test data
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
print('Test Data')
print(f'R2 score: {r2_test}')
print(f'MSE: {mse_test}')
print(f'MAE: {mae_test}')
Test Data R2 score: 0.4451011602363443 MSE: 3855.4386220341016 MAE: 50.40507077901542
# Plot the three data splits and the degree-1 fit on each split.
plt.figure(figsize=(10, 8))
plt.scatter(X_train, y_train, color='blue', label='Train Data')
plt.scatter(X_val, y_val, color='red', label='Validation Data')
plt.scatter(X_test, y_test, color='green', label='Test Data')
# FIX: sort each split by BMI before drawing the fit lines. plt.plot connects
# points in input order; with unsorted X the line zigzags back and forth
# (invisible for a straight degree-1 fit, badly wrong for higher degrees).
test_order = X_test.values.ravel().argsort()
plt.plot(X_test.values[test_order], y_test_pred[test_order],
         color='black', label='Degree 1 test data')
train_order = X_train.values.ravel().argsort()
plt.plot(X_train.values[train_order], models[1].predict(X_train)[train_order],
         color='yellow', label='Degree 1 train data')
val_order = X_val.values.ravel().argsort()
plt.plot(X_val.values[val_order], models[1].predict(X_val)[val_order],
         color='orange', label='Degree 1 val data')
plt.title('Polynomial Regression with model degree 1')
plt.xlabel('BMI')
plt.ylabel('Target')
plt.legend()
plt.show()
def print_pipeline_model_stats(model):
    """Print a fitted pipeline's final-step coefficients, intercept,
    and a human-readable polynomial equation.

    Assumes the last pipeline step exposes sklearn-style
    `coef_` and `intercept_` attributes.
    """
    linear_step = model[-1]
    print(f'Model: {model}')
    print(f'Coefficients: {linear_step.coef_}')
    print(f'Intercept: {linear_step.intercept_}')
    # Assemble "y = c0 * x^0 + c1 * x^1 + ... + intercept".
    terms = [f'{coef:.2f} * x^{power} + '
             for power, coef in enumerate(linear_step.coef_)]
    equation = 'y = ' + ''.join(terms) + f'{linear_step.intercept_:.2f}'
    print(f'Equation: {equation}')
# Report coefficients, intercept, and equation for the degree-1 model.
print_pipeline_model_stats(models[1])
Model: Pipeline(steps=[('polynomial', PolynomialFeatures(degree=1)),
('linear', LinearRegression())])
Coefficients: [ 0. 1195.15601307]
Intercept: 161.77667890765423
Equation: y = 0.00 * x^0 + 1195.16 * x^1 + 161.78
bmi_manual = 0.05
# Manual prediction from the rounded coefficients of the printed equation
# (y = 1195.16 * bmi + 161.78), as a sanity check against the model.
y_pred_manual = 1195.16 * (bmi_manual)**1 + 161.78
print(y_pred_manual)
# FIX: predict with a DataFrame carrying the 'bmi' column the pipeline was
# fitted with. The original passed a bare [[0.05]] list, which triggered
# sklearn's "X does not have valid feature names" UserWarning.
y_model = models[1].predict(pd.DataFrame({'bmi': [bmi_manual]}))
print(y_model)
221.538 [221.53447956]
d:\AI & ML\ML FOUNDATION\LABS\CSCN8010\venv\CSCN8010_classic_ml\Lib\site-packages\sklearn\base.py:439: UserWarning: X does not have valid feature names, but PolynomialFeatures was fitted with feature names
# Count trainable parameters (polynomial terms incl. the bias column)
# produced for BMI at each degree 0..5.
trainable_params = {}
for degree in range(6):
    transformer = PolynomialFeatures(degree=degree)
    expanded = transformer.fit_transform(X_train)
    trainable_params[degree] = expanded.shape[1]
    print(f'Degree {degree}: {transformer.get_feature_names_out()}')

degrees = list(trainable_params.keys())
params = list(trainable_params.values())
print('Degrees:', degrees)
print('Trainable Parameters:', params)
Degree 0: ['1'] Degree 1: ['1' 'bmi'] Degree 2: ['1' 'bmi' 'bmi^2'] Degree 3: ['1' 'bmi' 'bmi^2' 'bmi^3'] Degree 4: ['1' 'bmi' 'bmi^2' 'bmi^3' 'bmi^4'] Degree 5: ['1' 'bmi' 'bmi^2' 'bmi^3' 'bmi^4' 'bmi^5'] Degrees: [0, 1, 2, 3, 4, 5] Trainable Parameters: [1, 2, 3, 4, 5, 6]